HW1

Author

Patrick Casanas

Step 1

library(data.table)
library(magrittr)
library(leaflet)
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:data.table':

    between, first, last
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
# Directory holding the two yearly PM2.5 CSV exports. Set the PM566_DATA_DIR
# environment variable to run this on another machine; the default is the
# folder used originally.
data_dir <- Sys.getenv(
  "PM566_DATA_DIR",
  unset = "C:/Users/patri/OneDrive/Documents/MPH Sem 3/Health Data Science/PM566HW"
)

# fread() already returns a data.table, so no further conversion is needed.
data_2002 <- fread(file.path(data_dir, "2002_data.csv"))
data_2022 <- fread(file.path(data_dir, "2022_data.csv"))

# Work on deep copies: later steps modify data_table_* by reference with `:=`,
# and the raw imports should stay untouched.
data_table_2002 <- copy(data_2002)
data_table_2022 <- copy(data_2022)
head(data_table_2002)
         Date Source  Site ID   POC Daily Mean PM2.5 Concentration    Units
       <char> <char>    <int> <int>                          <num>   <char>
1: 01/05/2002    AQS 60010007     1                           25.1 ug/m3 LC
2: 01/06/2002    AQS 60010007     1                           31.6 ug/m3 LC
3: 01/08/2002    AQS 60010007     1                           21.4 ug/m3 LC
4: 01/11/2002    AQS 60010007     1                           25.9 ug/m3 LC
5: 01/14/2002    AQS 60010007     1                           34.5 ug/m3 LC
6: 01/17/2002    AQS 60010007     1                           41.0 ug/m3 LC
   Daily AQI Value Local Site Name Daily Obs Count Percent Complete
             <int>          <char>           <int>            <num>
1:              81       Livermore               1              100
2:              93       Livermore               1              100
3:              74       Livermore               1              100
4:              82       Livermore               1              100
5:              98       Livermore               1              100
6:             115       Livermore               1              100
   AQS Parameter Code AQS Parameter Description Method Code
                <int>                    <char>       <int>
1:              88101  PM2.5 - Local Conditions         120
2:              88101  PM2.5 - Local Conditions         120
3:              88101  PM2.5 - Local Conditions         120
4:              88101  PM2.5 - Local Conditions         120
5:              88101  PM2.5 - Local Conditions         120
6:              88101  PM2.5 - Local Conditions         120
                      Method Description CBSA Code
                                  <char>     <int>
1: Andersen RAAS2.5-300 PM2.5 SEQ w/WINS     41860
2: Andersen RAAS2.5-300 PM2.5 SEQ w/WINS     41860
3: Andersen RAAS2.5-300 PM2.5 SEQ w/WINS     41860
4: Andersen RAAS2.5-300 PM2.5 SEQ w/WINS     41860
5: Andersen RAAS2.5-300 PM2.5 SEQ w/WINS     41860
6: Andersen RAAS2.5-300 PM2.5 SEQ w/WINS     41860
                           CBSA Name State FIPS Code      State
                              <char>           <int>     <char>
1: San Francisco-Oakland-Hayward, CA               6 California
2: San Francisco-Oakland-Hayward, CA               6 California
3: San Francisco-Oakland-Hayward, CA               6 California
4: San Francisco-Oakland-Hayward, CA               6 California
5: San Francisco-Oakland-Hayward, CA               6 California
6: San Francisco-Oakland-Hayward, CA               6 California
   County FIPS Code  County Site Latitude Site Longitude
              <int>  <char>         <num>          <num>
1:                1 Alameda      37.68753      -121.7842
2:                1 Alameda      37.68753      -121.7842
3:                1 Alameda      37.68753      -121.7842
4:                1 Alameda      37.68753      -121.7842
5:                1 Alameda      37.68753      -121.7842
6:                1 Alameda      37.68753      -121.7842
# Preview the first rows of the 2022 table to compare its layout with 2002.
head(data_table_2022)
         Date Source  Site ID   POC Daily Mean PM2.5 Concentration    Units
       <char> <char>    <int> <int>                          <num>   <char>
1: 01/01/2022    AQS 60010007     3                           12.7 ug/m3 LC
2: 01/02/2022    AQS 60010007     3                           13.9 ug/m3 LC
3: 01/03/2022    AQS 60010007     3                            7.1 ug/m3 LC
4: 01/04/2022    AQS 60010007     3                            3.7 ug/m3 LC
5: 01/05/2022    AQS 60010007     3                            4.2 ug/m3 LC
6: 01/06/2022    AQS 60010007     3                            3.8 ug/m3 LC
   Daily AQI Value Local Site Name Daily Obs Count Percent Complete
             <int>          <char>           <int>            <num>
1:              58       Livermore               1              100
2:              60       Livermore               1              100
3:              39       Livermore               1              100
4:              21       Livermore               1              100
5:              23       Livermore               1              100
6:              21       Livermore               1              100
   AQS Parameter Code AQS Parameter Description Method Code
                <int>                    <char>       <int>
1:              88101  PM2.5 - Local Conditions         170
2:              88101  PM2.5 - Local Conditions         170
3:              88101  PM2.5 - Local Conditions         170
4:              88101  PM2.5 - Local Conditions         170
5:              88101  PM2.5 - Local Conditions         170
6:              88101  PM2.5 - Local Conditions         170
                     Method Description CBSA Code
                                 <char>     <int>
1: Met One BAM-1020 Mass Monitor w/VSCC     41860
2: Met One BAM-1020 Mass Monitor w/VSCC     41860
3: Met One BAM-1020 Mass Monitor w/VSCC     41860
4: Met One BAM-1020 Mass Monitor w/VSCC     41860
5: Met One BAM-1020 Mass Monitor w/VSCC     41860
6: Met One BAM-1020 Mass Monitor w/VSCC     41860
                           CBSA Name State FIPS Code      State
                              <char>           <int>     <char>
1: San Francisco-Oakland-Hayward, CA               6 California
2: San Francisco-Oakland-Hayward, CA               6 California
3: San Francisco-Oakland-Hayward, CA               6 California
4: San Francisco-Oakland-Hayward, CA               6 California
5: San Francisco-Oakland-Hayward, CA               6 California
6: San Francisco-Oakland-Hayward, CA               6 California
   County FIPS Code  County Site Latitude Site Longitude
              <int>  <char>         <num>          <num>
1:                1 Alameda      37.68753      -121.7842
2:                1 Alameda      37.68753      -121.7842
3:                1 Alameda      37.68753      -121.7842
4:                1 Alameda      37.68753      -121.7842
5:                1 Alameda      37.68753      -121.7842
6:                1 Alameda      37.68753      -121.7842

Checking dimensions, headers, footers, variable names, and variable types

# dim() returns c(rows, columns). Reuse these values when building the summary
# table instead of recomputing the counts from the raw imports.
dimensions_2002 <- dim(data_table_2002)
dimensions_2022 <- dim(data_table_2022)

# One row per dataset with its row and column counts.
dimensions_info <- data.table(
  Dataset = c("2002 Data", "2022 Data"),
  Rows = c(dimensions_2002[1], dimensions_2022[1]),
  Columns = c(dimensions_2002[2], dimensions_2022[2])
)
print(dimensions_info)
     Dataset  Rows Columns
      <char> <int>   <int>
1: 2002 Data 15976      22
2: 2022 Data 59756      22

Checking header and footer

# Label the 2002 head/tail output that follows.
print("2002 Data Header and Footer")
[1] "2002 Data Header and Footer"
# First rows (header) of the 2002 table.
print(head(data_table_2002))
         Date Source  Site ID   POC Daily Mean PM2.5 Concentration    Units
       <char> <char>    <int> <int>                          <num>   <char>
1: 01/05/2002    AQS 60010007     1                           25.1 ug/m3 LC
2: 01/06/2002    AQS 60010007     1                           31.6 ug/m3 LC
3: 01/08/2002    AQS 60010007     1                           21.4 ug/m3 LC
4: 01/11/2002    AQS 60010007     1                           25.9 ug/m3 LC
5: 01/14/2002    AQS 60010007     1                           34.5 ug/m3 LC
6: 01/17/2002    AQS 60010007     1                           41.0 ug/m3 LC
   Daily AQI Value Local Site Name Daily Obs Count Percent Complete
             <int>          <char>           <int>            <num>
1:              81       Livermore               1              100
2:              93       Livermore               1              100
3:              74       Livermore               1              100
4:              82       Livermore               1              100
5:              98       Livermore               1              100
6:             115       Livermore               1              100
   AQS Parameter Code AQS Parameter Description Method Code
                <int>                    <char>       <int>
1:              88101  PM2.5 - Local Conditions         120
2:              88101  PM2.5 - Local Conditions         120
3:              88101  PM2.5 - Local Conditions         120
4:              88101  PM2.5 - Local Conditions         120
5:              88101  PM2.5 - Local Conditions         120
6:              88101  PM2.5 - Local Conditions         120
                      Method Description CBSA Code
                                  <char>     <int>
1: Andersen RAAS2.5-300 PM2.5 SEQ w/WINS     41860
2: Andersen RAAS2.5-300 PM2.5 SEQ w/WINS     41860
3: Andersen RAAS2.5-300 PM2.5 SEQ w/WINS     41860
4: Andersen RAAS2.5-300 PM2.5 SEQ w/WINS     41860
5: Andersen RAAS2.5-300 PM2.5 SEQ w/WINS     41860
6: Andersen RAAS2.5-300 PM2.5 SEQ w/WINS     41860
                           CBSA Name State FIPS Code      State
                              <char>           <int>     <char>
1: San Francisco-Oakland-Hayward, CA               6 California
2: San Francisco-Oakland-Hayward, CA               6 California
3: San Francisco-Oakland-Hayward, CA               6 California
4: San Francisco-Oakland-Hayward, CA               6 California
5: San Francisco-Oakland-Hayward, CA               6 California
6: San Francisco-Oakland-Hayward, CA               6 California
   County FIPS Code  County Site Latitude Site Longitude
              <int>  <char>         <num>          <num>
1:                1 Alameda      37.68753      -121.7842
2:                1 Alameda      37.68753      -121.7842
3:                1 Alameda      37.68753      -121.7842
4:                1 Alameda      37.68753      -121.7842
5:                1 Alameda      37.68753      -121.7842
6:                1 Alameda      37.68753      -121.7842
# Last rows (footer) of the 2002 table.
print(tail(data_table_2002))
         Date Source  Site ID   POC Daily Mean PM2.5 Concentration    Units
       <char> <char>    <int> <int>                          <num>   <char>
1: 12/10/2002    AQS 61131003     1                             15 ug/m3 LC
2: 12/13/2002    AQS 61131003     1                             15 ug/m3 LC
3: 12/22/2002    AQS 61131003     1                              1 ug/m3 LC
4: 12/25/2002    AQS 61131003     1                             23 ug/m3 LC
5: 12/28/2002    AQS 61131003     1                              5 ug/m3 LC
6: 12/31/2002    AQS 61131003     1                              6 ug/m3 LC
   Daily AQI Value      Local Site Name Daily Obs Count Percent Complete
             <int>               <char>           <int>            <num>
1:              62 Woodland-Gibson Road               1              100
2:              62 Woodland-Gibson Road               1              100
3:               6 Woodland-Gibson Road               1              100
4:              77 Woodland-Gibson Road               1              100
5:              28 Woodland-Gibson Road               1              100
6:              33 Woodland-Gibson Road               1              100
   AQS Parameter Code AQS Parameter Description Method Code
                <int>                    <char>       <int>
1:              88101  PM2.5 - Local Conditions         117
2:              88101  PM2.5 - Local Conditions         117
3:              88101  PM2.5 - Local Conditions         117
4:              88101  PM2.5 - Local Conditions         117
5:              88101  PM2.5 - Local Conditions         117
6:              88101  PM2.5 - Local Conditions         117
                      Method Description CBSA Code
                                  <char>     <int>
1: R & P Model 2000 PM2.5 Sampler w/WINS     40900
2: R & P Model 2000 PM2.5 Sampler w/WINS     40900
3: R & P Model 2000 PM2.5 Sampler w/WINS     40900
4: R & P Model 2000 PM2.5 Sampler w/WINS     40900
5: R & P Model 2000 PM2.5 Sampler w/WINS     40900
6: R & P Model 2000 PM2.5 Sampler w/WINS     40900
                                 CBSA Name State FIPS Code      State
                                    <char>           <int>     <char>
1: Sacramento--Roseville--Arden-Arcade, CA               6 California
2: Sacramento--Roseville--Arden-Arcade, CA               6 California
3: Sacramento--Roseville--Arden-Arcade, CA               6 California
4: Sacramento--Roseville--Arden-Arcade, CA               6 California
5: Sacramento--Roseville--Arden-Arcade, CA               6 California
6: Sacramento--Roseville--Arden-Arcade, CA               6 California
   County FIPS Code County Site Latitude Site Longitude
              <int> <char>         <num>          <num>
1:              113   Yolo      38.66121      -121.7327
2:              113   Yolo      38.66121      -121.7327
3:              113   Yolo      38.66121      -121.7327
4:              113   Yolo      38.66121      -121.7327
5:              113   Yolo      38.66121      -121.7327
6:              113   Yolo      38.66121      -121.7327
# Label the 2022 head/tail output that follows.
print("2022 Data Header and Footer")
[1] "2022 Data Header and Footer"
# First rows (header) of the 2022 table.
print(head(data_table_2022))
         Date Source  Site ID   POC Daily Mean PM2.5 Concentration    Units
       <char> <char>    <int> <int>                          <num>   <char>
1: 01/01/2022    AQS 60010007     3                           12.7 ug/m3 LC
2: 01/02/2022    AQS 60010007     3                           13.9 ug/m3 LC
3: 01/03/2022    AQS 60010007     3                            7.1 ug/m3 LC
4: 01/04/2022    AQS 60010007     3                            3.7 ug/m3 LC
5: 01/05/2022    AQS 60010007     3                            4.2 ug/m3 LC
6: 01/06/2022    AQS 60010007     3                            3.8 ug/m3 LC
   Daily AQI Value Local Site Name Daily Obs Count Percent Complete
             <int>          <char>           <int>            <num>
1:              58       Livermore               1              100
2:              60       Livermore               1              100
3:              39       Livermore               1              100
4:              21       Livermore               1              100
5:              23       Livermore               1              100
6:              21       Livermore               1              100
   AQS Parameter Code AQS Parameter Description Method Code
                <int>                    <char>       <int>
1:              88101  PM2.5 - Local Conditions         170
2:              88101  PM2.5 - Local Conditions         170
3:              88101  PM2.5 - Local Conditions         170
4:              88101  PM2.5 - Local Conditions         170
5:              88101  PM2.5 - Local Conditions         170
6:              88101  PM2.5 - Local Conditions         170
                     Method Description CBSA Code
                                 <char>     <int>
1: Met One BAM-1020 Mass Monitor w/VSCC     41860
2: Met One BAM-1020 Mass Monitor w/VSCC     41860
3: Met One BAM-1020 Mass Monitor w/VSCC     41860
4: Met One BAM-1020 Mass Monitor w/VSCC     41860
5: Met One BAM-1020 Mass Monitor w/VSCC     41860
6: Met One BAM-1020 Mass Monitor w/VSCC     41860
                           CBSA Name State FIPS Code      State
                              <char>           <int>     <char>
1: San Francisco-Oakland-Hayward, CA               6 California
2: San Francisco-Oakland-Hayward, CA               6 California
3: San Francisco-Oakland-Hayward, CA               6 California
4: San Francisco-Oakland-Hayward, CA               6 California
5: San Francisco-Oakland-Hayward, CA               6 California
6: San Francisco-Oakland-Hayward, CA               6 California
   County FIPS Code  County Site Latitude Site Longitude
              <int>  <char>         <num>          <num>
1:                1 Alameda      37.68753      -121.7842
2:                1 Alameda      37.68753      -121.7842
3:                1 Alameda      37.68753      -121.7842
4:                1 Alameda      37.68753      -121.7842
5:                1 Alameda      37.68753      -121.7842
6:                1 Alameda      37.68753      -121.7842
# Last rows (footer) of the 2022 table.
print(tail(data_table_2022))
         Date Source  Site ID   POC Daily Mean PM2.5 Concentration    Units
       <char> <char>    <int> <int>                          <num>   <char>
1: 12/01/2022    AQS 61131003     1                            3.4 ug/m3 LC
2: 12/07/2022    AQS 61131003     1                            3.8 ug/m3 LC
3: 12/13/2022    AQS 61131003     1                            6.0 ug/m3 LC
4: 12/19/2022    AQS 61131003     1                           34.8 ug/m3 LC
5: 12/25/2022    AQS 61131003     1                           23.2 ug/m3 LC
6: 12/31/2022    AQS 61131003     1                            1.0 ug/m3 LC
   Daily AQI Value      Local Site Name Daily Obs Count Percent Complete
             <int>               <char>           <int>            <num>
1:              19 Woodland-Gibson Road               1              100
2:              21 Woodland-Gibson Road               1              100
3:              33 Woodland-Gibson Road               1              100
4:              99 Woodland-Gibson Road               1              100
5:              77 Woodland-Gibson Road               1              100
6:               6 Woodland-Gibson Road               1              100
   AQS Parameter Code AQS Parameter Description Method Code
                <int>                    <char>       <int>
1:              88101  PM2.5 - Local Conditions         145
2:              88101  PM2.5 - Local Conditions         145
3:              88101  PM2.5 - Local Conditions         145
4:              88101  PM2.5 - Local Conditions         145
5:              88101  PM2.5 - Local Conditions         145
6:              88101  PM2.5 - Local Conditions         145
                                      Method Description CBSA Code
                                                  <char>     <int>
1: R & P Model 2025 PM-2.5 Sequential Air Sampler w/VSCC     40900
2: R & P Model 2025 PM-2.5 Sequential Air Sampler w/VSCC     40900
3: R & P Model 2025 PM-2.5 Sequential Air Sampler w/VSCC     40900
4: R & P Model 2025 PM-2.5 Sequential Air Sampler w/VSCC     40900
5: R & P Model 2025 PM-2.5 Sequential Air Sampler w/VSCC     40900
6: R & P Model 2025 PM-2.5 Sequential Air Sampler w/VSCC     40900
                                 CBSA Name State FIPS Code      State
                                    <char>           <int>     <char>
1: Sacramento--Roseville--Arden-Arcade, CA               6 California
2: Sacramento--Roseville--Arden-Arcade, CA               6 California
3: Sacramento--Roseville--Arden-Arcade, CA               6 California
4: Sacramento--Roseville--Arden-Arcade, CA               6 California
5: Sacramento--Roseville--Arden-Arcade, CA               6 California
6: Sacramento--Roseville--Arden-Arcade, CA               6 California
   County FIPS Code County Site Latitude Site Longitude
              <int> <char>         <num>          <num>
1:              113   Yolo      38.66121      -121.7327
2:              113   Yolo      38.66121      -121.7327
3:              113   Yolo      38.66121      -121.7327
4:              113   Yolo      38.66121      -121.7327
5:              113   Yolo      38.66121      -121.7327
6:              113   Yolo      38.66121      -121.7327
# Label the 2002 column-name listing that follows.
print("Variables for 2002")
[1] "Variables for 2002"
# Column names of the raw 2002 import.
print(colnames(data_2002))
 [1] "Date"                           "Source"                        
 [3] "Site ID"                        "POC"                           
 [5] "Daily Mean PM2.5 Concentration" "Units"                         
 [7] "Daily AQI Value"                "Local Site Name"               
 [9] "Daily Obs Count"                "Percent Complete"              
[11] "AQS Parameter Code"             "AQS Parameter Description"     
[13] "Method Code"                    "Method Description"            
[15] "CBSA Code"                      "CBSA Name"                     
[17] "State FIPS Code"                "State"                         
[19] "County FIPS Code"               "County"                        
[21] "Site Latitude"                  "Site Longitude"                
# Label the 2022 column-name listing that follows.
print("Variables for 2022")
[1] "Variables for 2022"
# Column names of the raw 2022 import, for comparison with the 2002 names.
print(colnames(data_2022))
 [1] "Date"                           "Source"                        
 [3] "Site ID"                        "POC"                           
 [5] "Daily Mean PM2.5 Concentration" "Units"                         
 [7] "Daily AQI Value"                "Local Site Name"               
 [9] "Daily Obs Count"                "Percent Complete"              
[11] "AQS Parameter Code"             "AQS Parameter Description"     
[13] "Method Code"                    "Method Description"            
[15] "CBSA Code"                      "CBSA Name"                     
[17] "State FIPS Code"                "State"                         
[19] "County FIPS Code"               "County"                        
[21] "Site Latitude"                  "Site Longitude"                

Checking Variable Types

Variable Types for 2002

# Show the storage type of every 2002 column; Date is still character here.
str(data_table_2002)
Classes 'data.table' and 'data.frame':  15976 obs. of  22 variables:
 $ Date                          : chr  "01/05/2002" "01/06/2002" "01/08/2002" "01/11/2002" ...
 $ Source                        : chr  "AQS" "AQS" "AQS" "AQS" ...
 $ Site ID                       : int  60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 ...
 $ POC                           : int  1 1 1 1 1 1 1 1 1 1 ...
 $ Daily Mean PM2.5 Concentration: num  25.1 31.6 21.4 25.9 34.5 41 29.3 15 18.8 37.9 ...
 $ Units                         : chr  "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" ...
 $ Daily AQI Value               : int  81 93 74 82 98 115 89 62 69 107 ...
 $ Local Site Name               : chr  "Livermore" "Livermore" "Livermore" "Livermore" ...
 $ Daily Obs Count               : int  1 1 1 1 1 1 1 1 1 1 ...
 $ Percent Complete              : num  100 100 100 100 100 100 100 100 100 100 ...
 $ AQS Parameter Code            : int  88101 88101 88101 88101 88101 88101 88101 88101 88101 88101 ...
 $ AQS Parameter Description     : chr  "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" ...
 $ Method Code                   : int  120 120 120 120 120 120 120 120 120 120 ...
 $ Method Description            : chr  "Andersen RAAS2.5-300 PM2.5 SEQ w/WINS" "Andersen RAAS2.5-300 PM2.5 SEQ w/WINS" "Andersen RAAS2.5-300 PM2.5 SEQ w/WINS" "Andersen RAAS2.5-300 PM2.5 SEQ w/WINS" ...
 $ CBSA Code                     : int  41860 41860 41860 41860 41860 41860 41860 41860 41860 41860 ...
 $ CBSA Name                     : chr  "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" ...
 $ State FIPS Code               : int  6 6 6 6 6 6 6 6 6 6 ...
 $ State                         : chr  "California" "California" "California" "California" ...
 $ County FIPS Code              : int  1 1 1 1 1 1 1 1 1 1 ...
 $ County                        : chr  "Alameda" "Alameda" "Alameda" "Alameda" ...
 $ Site Latitude                 : num  37.7 37.7 37.7 37.7 37.7 ...
 $ Site Longitude                : num  -122 -122 -122 -122 -122 ...
 - attr(*, ".internal.selfref")=<externalptr> 

Variable types for 2022

# Show the storage type of every 2022 column; Date is still character here.
str(data_table_2022)
Classes 'data.table' and 'data.frame':  59756 obs. of  22 variables:
 $ Date                          : chr  "01/01/2022" "01/02/2022" "01/03/2022" "01/04/2022" ...
 $ Source                        : chr  "AQS" "AQS" "AQS" "AQS" ...
 $ Site ID                       : int  60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 ...
 $ POC                           : int  3 3 3 3 3 3 3 3 3 3 ...
 $ Daily Mean PM2.5 Concentration: num  12.7 13.9 7.1 3.7 4.2 3.8 2.3 6.9 13.6 11.2 ...
 $ Units                         : chr  "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" ...
 $ Daily AQI Value               : int  58 60 39 21 23 21 13 38 59 55 ...
 $ Local Site Name               : chr  "Livermore" "Livermore" "Livermore" "Livermore" ...
 $ Daily Obs Count               : int  1 1 1 1 1 1 1 1 1 1 ...
 $ Percent Complete              : num  100 100 100 100 100 100 100 100 100 100 ...
 $ AQS Parameter Code            : int  88101 88101 88101 88101 88101 88101 88101 88101 88101 88101 ...
 $ AQS Parameter Description     : chr  "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" ...
 $ Method Code                   : int  170 170 170 170 170 170 170 170 170 170 ...
 $ Method Description            : chr  "Met One BAM-1020 Mass Monitor w/VSCC" "Met One BAM-1020 Mass Monitor w/VSCC" "Met One BAM-1020 Mass Monitor w/VSCC" "Met One BAM-1020 Mass Monitor w/VSCC" ...
 $ CBSA Code                     : int  41860 41860 41860 41860 41860 41860 41860 41860 41860 41860 ...
 $ CBSA Name                     : chr  "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" ...
 $ State FIPS Code               : int  6 6 6 6 6 6 6 6 6 6 ...
 $ State                         : chr  "California" "California" "California" "California" ...
 $ County FIPS Code              : int  1 1 1 1 1 1 1 1 1 1 ...
 $ County                        : chr  "Alameda" "Alameda" "Alameda" "Alameda" ...
 $ Site Latitude                 : num  37.7 37.7 37.7 37.7 37.7 ...
 $ Site Longitude                : num  -122 -122 -122 -122 -122 ...
 - attr(*, ".internal.selfref")=<externalptr> 

Step 2

# The Date column arrives as month/day/year text. Replace it with Date objects
# and add the calendar year, updating each yearly table by reference.
date_format <- "%m/%d/%Y"

set(
  data_table_2002,
  j = "Date",
  value = as.Date(data_table_2002[["Date"]], format = date_format)
)
set(
  data_table_2022,
  j = "Date",
  value = as.Date(data_table_2022[["Date"]], format = date_format)
)
set(data_table_2002, j = "Year", value = year(data_table_2002[["Date"]]))
set(data_table_2022, j = "Year", value = year(data_table_2022[["Date"]]))

# Stack both years into a single table, matching columns by name.
combined_data <- rbindlist(
  list(data_table_2002, data_table_2022),
  use.names = TRUE
)

Changing Variable Names

# Shorten the column names used most often below. Each entry maps an original
# name (left) to its replacement (right); setnames() renames in place.
rename_map <- c(
  "Site ID" = "SiteID",
  "Site Longitude" = "Longitude",
  "Site Latitude" = "Latitude",
  "Daily Mean PM2.5 Concentration" = "Mean PM2.5"
)
setnames(combined_data, old = names(rename_map), new = unname(rename_map))
head(combined_data)
         Date Source   SiteID   POC Mean PM2.5    Units Daily AQI Value
       <Date> <char>    <int> <int>      <num>   <char>           <int>
1: 2002-01-05    AQS 60010007     1       25.1 ug/m3 LC              81
2: 2002-01-06    AQS 60010007     1       31.6 ug/m3 LC              93
3: 2002-01-08    AQS 60010007     1       21.4 ug/m3 LC              74
4: 2002-01-11    AQS 60010007     1       25.9 ug/m3 LC              82
5: 2002-01-14    AQS 60010007     1       34.5 ug/m3 LC              98
6: 2002-01-17    AQS 60010007     1       41.0 ug/m3 LC             115
   Local Site Name Daily Obs Count Percent Complete AQS Parameter Code
            <char>           <int>            <num>              <int>
1:       Livermore               1              100              88101
2:       Livermore               1              100              88101
3:       Livermore               1              100              88101
4:       Livermore               1              100              88101
5:       Livermore               1              100              88101
6:       Livermore               1              100              88101
   AQS Parameter Description Method Code                    Method Description
                      <char>       <int>                                <char>
1:  PM2.5 - Local Conditions         120 Andersen RAAS2.5-300 PM2.5 SEQ w/WINS
2:  PM2.5 - Local Conditions         120 Andersen RAAS2.5-300 PM2.5 SEQ w/WINS
3:  PM2.5 - Local Conditions         120 Andersen RAAS2.5-300 PM2.5 SEQ w/WINS
4:  PM2.5 - Local Conditions         120 Andersen RAAS2.5-300 PM2.5 SEQ w/WINS
5:  PM2.5 - Local Conditions         120 Andersen RAAS2.5-300 PM2.5 SEQ w/WINS
6:  PM2.5 - Local Conditions         120 Andersen RAAS2.5-300 PM2.5 SEQ w/WINS
   CBSA Code                         CBSA Name State FIPS Code      State
       <int>                            <char>           <int>     <char>
1:     41860 San Francisco-Oakland-Hayward, CA               6 California
2:     41860 San Francisco-Oakland-Hayward, CA               6 California
3:     41860 San Francisco-Oakland-Hayward, CA               6 California
4:     41860 San Francisco-Oakland-Hayward, CA               6 California
5:     41860 San Francisco-Oakland-Hayward, CA               6 California
6:     41860 San Francisco-Oakland-Hayward, CA               6 California
   County FIPS Code  County Latitude Longitude  Year
              <int>  <char>    <num>     <num> <int>
1:                1 Alameda 37.68753 -121.7842  2002
2:                1 Alameda 37.68753 -121.7842  2002
3:                1 Alameda 37.68753 -121.7842  2002
4:                1 Alameda 37.68753 -121.7842  2002
5:                1 Alameda 37.68753 -121.7842  2002
6:                1 Alameda 37.68753 -121.7842  2002

Step 3

# combined_data has one row per daily observation, so plotting it directly
# stacks many identical markers on every site. Reduce to one row per site and
# year before mapping.
site_locations <- unique(
  combined_data[, .(SiteID, County, Latitude, Longitude, Year)]
)

# Blue markers are sites reporting in 2002, red markers are sites reporting in
# 2022. Clicking a marker shows the site ID and county.
map <- leaflet() %>%
  addTiles() %>%
  addCircleMarkers(
    data = site_locations[Year == 2002],
    lng = ~Longitude, lat = ~Latitude,
    color = "blue", radius = 1,
    popup = ~paste("Site ID:", SiteID, "<br>County:", County)
  ) %>%
  addCircleMarkers(
    data = site_locations[Year == 2022],
    lng = ~Longitude, lat = ~Latitude,
    color = "red", radius = 1,
    popup = ~paste("Site ID:", SiteID, "<br>County:", County)
  ) %>%
  addLegend(
    position = "bottomright",
    colors = c("blue", "red"),
    labels = c("2002", "2022"),
    title = "Monitoring year"
  )

map

Overall, there appear to be more data points for 2022. Fewer sites are shown in blue (2002 data) than in red (2022 data). The 2002 sites are spread fairly evenly across California, whereas the larger number of sites in 2022 makes the 2022 sites appear more clustered.

Step 4

Number and Proportion of missing PM2.5 Concentrations

# Number of observations whose PM2.5 concentration is NA.
missing_PM2.5 <- combined_data[, sum(is.na(`Mean PM2.5`))]
print(missing_PM2.5)
[1] 0
# Share of all observations that are missing.
missing_PM2.5 / nrow(combined_data)
[1] 0

There are no temporal patterns in missing values because there are no missing values in either year.

Number and Proportion of PM2.5 Concentrations < 0

# Number of observations with a negative concentration. na.rm = TRUE keeps the
# count defined even if a concentration is NA, matching the na.rm = TRUE used
# by the summary statistics later in this analysis.
negative_PM2.5 <- sum(combined_data[["Mean PM2.5"]] < 0, na.rm = TRUE)
print(negative_PM2.5)
[1] 215
# Share of all observations that are negative.
negative_PM2.5 / nrow(combined_data)
[1] 0.002838958

Since there are negative values, they will be categorized by year to see if there was a year that produced more negative PM2.5 concentration values.

# Negative-concentration count per year.
negative_values_by_year <- combined_data[
  ,
  list(negative_PM2.5 = sum(`Mean PM2.5` < 0)),
  by = "Year"
]

# Total number of observations per year (despite the name, this counts all
# rows, not only the negative ones).
total_negative_by_year <- combined_data[, list(total_count = .N), by = "Year"]

print(negative_values_by_year)
    Year negative_PM2.5
   <int>          <int>
1:  2002              0
2:  2022            215

This analysis shows that all (100%) of the implausible negative values came from 2022.

Question 6

State Level Analysis

# Statewide mean, median and standard deviation of PM2.5 for each year.
state_stats <- combined_data[
  ,
  list(
    mean_pm25 = mean(`Mean PM2.5`, na.rm = TRUE),
    median_pm25 = median(`Mean PM2.5`, na.rm = TRUE),
    sd_pm25 = sd(`Mean PM2.5`, na.rm = TRUE)
  ),
  by = "Year"
]

print(state_stats)
    Year mean_pm25 median_pm25  sd_pm25
   <int>     <num>       <num>    <num>
1:  2002 16.115943        12.0 13.86737
2:  2022  8.428499         6.8  7.64424
# Side-by-side distributions of the daily concentrations, one box per year
# (blue box first, red box second).
boxplot(
  `Mean PM2.5` ~ Year,
  data = combined_data,
  col = c("blue", "red"),
  xlab = "Year",
  ylab = "PM2.5 Concentration (µg/m³)",
  main = "State-level PM2.5 Concentrations in California (2002 vs 2022)"
)

library(ggplot2)

# Line plot of the statewide mean for the two years. Year is numeric, so the
# axis breaks are pinned to the years actually present; the default breaks
# would label neither 2002 nor 2022.
ggplot(state_stats, aes(x = Year, y = mean_pm25)) +
  geom_line() +
  geom_point() +
  scale_x_continuous(breaks = unique(state_stats[["Year"]])) +
  labs(title = "State-level Mean PM2.5 Concentrations in California (2002 vs 2022)",
       x = "Year", y = "Mean PM2.5 (µg/m³)") +
  theme_minimal()

The box plots show that the range of PM2.5 concentrations was wider in 2022, while the median was lower. The line graph shows that the statewide mean PM2.5 concentration decreased from 2002 to 2022.

County Level Analysis

# Mean PM2.5 for every county in each year.
county_mean <- combined_data[, .(mean_pm25 = mean(`Mean PM2.5`, na.rm = TRUE)),
                             by = .(County, Year)]
# The original chained assignment also created `county_avg`; it is kept as an
# explicit alias in case anything else refers to it.
county_avg <- county_mean

# One line per county joining its 2002 and 2022 means. The axis breaks are
# pinned to the years present, since the default numeric breaks would label
# neither 2002 nor 2022.
ggplot(county_mean, aes(x = Year, y = mean_pm25, group = County, color = County)) +
  geom_line() +
  geom_point() +
  scale_x_continuous(breaks = unique(county_mean[["Year"]])) +
  labs(title = "Mean PM2.5 Concentrations by County (2002 vs 2022)",
       x = "Year", y = "Mean PM2.5 (µg/m³)") +
  theme_minimal()

The county-level line graph shows a mix of increases and decreases in mean PM2.5 concentrations between 2002 and 2022. The distribution of county means appears more condensed in 2022.

# Restrict to Los Angeles County, then average PM2.5 for each monitoring site
# within each year.
LA_County <- combined_data[County == "Los Angeles"]
LA_site_avg <- LA_County[
  ,
  list(mean_pm25 = mean(`Mean PM2.5`, na.rm = TRUE)),
  by = c("SiteID", "Year")
]
print(LA_site_avg)
      SiteID  Year mean_pm25
       <int> <int>     <num>
 1: 60370002  2002 20.764307
 2: 60371002  2002 23.969672
 3: 60371103  2002 21.967945
 4: 60371201  2002 18.854167
 5: 60371301  2002 23.345082
 6: 60371601  2002 23.886441
 7: 60372005  2002 20.290909
 8: 60374002  2002 19.471067
 9: 60379033  2002 10.383178
10: 60379034  2002  4.815596
11: 60370002  2022  9.717105
12: 60370016  2022  8.422466
13: 60371103  2022 11.582781
14: 60371201  2022 10.715768
15: 60371302  2022 12.989903
16: 60371602  2022 11.447458
17: 60372005  2022  9.094167
18: 60374002  2022  9.916364
19: 60374004  2022 11.970711
20: 60374008  2022 13.016551
21: 60374009  2022  8.852560
22: 60374010  2022 13.018082
23: 60376012  2022  9.144110
24: 60379033  2022  7.518391
25: 60379034  2022  3.497561
      SiteID  Year mean_pm25
# One line per monitoring site joining its 2002 and 2022 means. Grouping by
# SiteID matters: joining the points in row order would link unrelated sites
# and hide the per-site change. Sites present in only one year appear as a
# single point.
ggplot(LA_site_avg,
       aes(x = Year, y = mean_pm25, group = SiteID, color = factor(SiteID))) +
  geom_line() +
  geom_point() +
  scale_x_continuous(breaks = unique(LA_site_avg[["Year"]])) +
  labs(title = "Mean PM2.5 Concentrations by Site in Los Angeles County(2002 vs 2022)",
       x = "Year", y = "Mean PM2.5 (µg/m³)", color = "Site ID") +
  theme_minimal()

The per-site means printed above show that mean PM2.5 concentrations decreased in Los Angeles County from 2002 to 2022: every site with data in both years had a lower mean in 2022 (for example, site 60370002 fell from about 20.8 to 9.7 µg/m³). This agrees with the state-level results, which also show lower concentrations in 2022.